Aplp1_Analysis
Analysis of 3'UTR usage in neural stem cells of the subventricular zone (Preprint: Goepferich & George et al. 2020)
Final plots in the paper may differ slightly due to:
- figures are not in order as in the paper (due to the intrinsic logic of this script and the input data)
- changes pending on the peer-review process
- collecting and final editing of sub-plots in Adobe Illustrator
- differences R package versions or other parameters
Note: This is a reporting file providing all the necessary programming code to comprehend the main results of the paper.
Note: Precompield files are provided in this directory.
Note: Computed as HTML-File
Downstream Analysis of in-vivo 3'UTRs (APLP1 KO + WT)
... load the previously created 3'mapping file
# load the stored 3'UTR object
load(file = '/Volumes/g381-daten2/goepferich/3UTR_REANALYSIS_2019/aplp1_downstream/aplp1Filtered.rda')... some basic metrices & 3'tail-peak analysis
length( aplp1Filtered@NAMES ) # number of exons## [1] 9754
length( unique( names(aplp1Filtered@NAMES ) ) ) # number of genes## [1] 9099
sum( tapply(aplp1Filtered@NAMES, names(aplp1Filtered@NAMES), length ) > 1 ) # number of genes with multiple exons (used)## [1] 487
apa <- get_mfsc(aplp1Filtered, 'features' ) %>% unlist()
apa <- apa > 1
table(apa) # TRUE, exons with APA events## apa
## FALSE TRUE
## 4224 5530
# subset to genes with only (potentially) APA events
apaFiltered <- aplp1Filtered[apa]... create some nice lists with the S4 approach
These lists are required for the transcript-by-transcript approach
# create lists of the required input-values
ctab_list <- get_mfsc(apaFiltered, 'counts')
clen_list <- get_mfsc(apaFiltered, 'cellMean')
peak_list <- get_mfsc(apaFiltered, 'peaks')
supp_list <- get_mfsc(apaFiltered, 'support')
anno_list <- get_annotation(apaFiltered)
# The list of isoform count tables!
str( head(ctab_list, n = 2) )## List of 2
## $ ENSMUSG00000033813_3: int [1:30, 1:4] 0 1 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:30] "ACTGAACAGTTAGGTA_KO" "AGAGCGAGTAAGTTCC_WT" "AGCCTAATCTGGCGTG_KO" "AGGGAGTCACGTCTCT_KO" ...
## .. ..$ : NULL
## $ ENSMUSG00000033793 : int [1:49, 1:2] 0 1 1 0 1 1 7 1 1 1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:49] "AACGTTGCATGGATGG_WT" "AACTCAGTCCGTACAA_WT" "ACACTGAAGAGTTGGC_WT" "ACATACGAGGCATTGG_WT" ...
## .. ..$ : NULL
# The list of 3'UTR lengths!
str( head(clen_list, n = 2) )## List of 2
## $ ENSMUSG00000033813_3: num [1:30] 743 291 600 600 1540 ...
## $ ENSMUSG00000033793 : num [1:49] 372 277 280 372 276 ...
# The list of matched cell annotations
str( head(anno_list, n = 2) )## List of 2
## $ ENSMUSG00000033813_3: tibble [30 × 4] (S3: tbl_df/tbl/data.frame)
## ..$ CellBarcode: chr [1:30] "ACTGAACAGTTAGGTA_KO" "AGAGCGAGTAAGTTCC_WT" "AGCCTAATCTGGCGTG_KO" "AGGGAGTCACGTCTCT_KO" ...
## ..$ Genotype : chr [1:30] "KO" "WT" "KO" "KO" ...
## ..$ Replicate : chr [1:30] "mouse_2" "mouse_3" "mouse_3" "mouse_1" ...
## ..$ pTime : num [1:30] 2.236 0.767 5.275 5.113 1.383 ...
## $ ENSMUSG00000033793 : tibble [49 × 4] (S3: tbl_df/tbl/data.frame)
## ..$ CellBarcode: chr [1:49] "AACGTTGCATGGATGG_WT" "AACTCAGTCCGTACAA_WT" "ACACTGAAGAGTTGGC_WT" "ACATACGAGGCATTGG_WT" ...
## ..$ Genotype : chr [1:49] "WT" "WT" "WT" "WT" ...
## ..$ Replicate : chr [1:49] "mouse_2" "mouse_2" "mouse_0" "mouse_2" ...
## ..$ pTime : num [1:49] 0.89 0.598 0.486 0.823 4.986 ...
... multinomial regression analysis with the 'nnet' package
Test for differential 3'UTR usage (APLP1-/- vs. WT) --> LRT-statistic
The hypothesis is that 3'UTR isoform choice depends on the genotype
# apply multinomial regression
pbulk_pval <- pbmapply(function(x, y, dof){
ko_th <- colSums(x[y$Genotype == 'KO', ,drop = F] > 0) > 2 # pseudo-bulk over Aplp1-KOs
wt_th <- colSums(x[y$Genotype == 'WT', ,drop = F] > 0) > 2 # pseudo-bulk over WTs
use_apa <- ko_th | wt_th
x <- x[ ,use_apa, drop = FALSE]
if( sum(use_apa) > 1 & sum(y$Genotype == 'KO') > 9 & sum(y$Genotype == 'WT') > 9 ){
x_ <- rowsum(x, group = paste(y$Genotype, y$Replicate) )
if( all(rowSums(x_) > 2) ){
fit1 <- nnet::multinom(x_ ~ str_split(rownames(x_), '', simplify = TRUE)[ ,1] , trace = FALSE )
fit0 <- nnet::multinom(x_ ~ 1, trace = FALSE )
pval <- anova(fit1, fit0)$'Pr(Chi)'[2]
} else {
pval <- NA
}
} else {
pval <- NA
}
return(pval)
}, x = ctab_list, y = anno_list, SIMPLIFY = TRUE )
# p-value (MNR): effect of the genotype on 3'UTR usage
str( head(pbulk_pval, n = 5) )## Named num [1:5] NA 0.7715 0.0329 NA 0.0264
## - attr(*, "names")= chr [1:5] "ENSMUSG00000033813_3" "ENSMUSG00000033793" "ENSMUSG00000025907_4" "ENSMUSG00000033740" ...
# select the genes (by index) after FDR correction
sel <- which(p.adjust(pbulk_pval, method = 'fdr') < .05) ... comparison of the pseudotime density (average lineage progression state) between genotypes
ggplot(aplp1Filtered@metadata, aes(pTime, fill = Genotype)) + geom_density(alpha = 0.5) + theme_minimal()... the Earth Mover's Distance (EMD) - a mesaure for differential 3'UTR usage between both genotypes
emd <- mapply(function(x, y){
ko <- colSums(x[y$Genotype == 'KO' , ,drop = F])
wt <- colSums(x[y$Genotype == 'WT' , ,drop = F])
wt_norm <- wt/sum(wt)
ko_norm <- ko/sum(ko)
emd <- sum( cumsum( wt_norm - ko_norm ) )
return(emd)
}, x = ctab_list, y = anno_list, SIMPLIFY = TRUE ) #%>% log2()
# Earth Mover's Distance, a metric fot effect of the genotype
str( head(pbulk_pval, n = 5) )## Named num [1:5] NA 0.7715 0.0329 NA 0.0264
## - attr(*, "names")= chr [1:5] "ENSMUSG00000033813_3" "ENSMUSG00000033793" "ENSMUSG00000025907_4" "ENSMUSG00000033740" ...
... plot these results!
trend <- rep('n.s.', times = length(emd))
trend[sel] <- 'MNR < FDR 5%'
aputr_tbl <- tibble(Expression = sapply(supp_list, sum),
EMD = emd,
Trend = trend,
Gene = apaFiltered@NAMES)
aputr_tbl$Expression[aputr_tbl$Expression > 5000] <- 5000 # restrict genes with extremely high expression (outliers)
table(aputr_tbl$Trend)##
## MNR < FDR 5% n.s.
## 971 4559
# mark some selected genes
selected <- c('ENSMUSG00000098754', 'ENSMUSG00000024109', 'ENSMUSG00000022285',
'ENSMUSG00000024146', 'ENSMUSG00000094483', 'ENSMUSG00000069769' )
# Prn, Nrxn1, Ywhaz, Cript, Purb, Msi2
gg_ma_geno <- ggplot(aputr_tbl, aes(y = EMD, x = Expression, colour = Trend, label = Gene )) +
geom_point( data = filter(aputr_tbl, Expression == 5000 ), size = .75, shape = 23) +
geom_point( data = filter(aputr_tbl, Trend == 'n.s.' ), color = 'lightgrey') +
geom_point( data = filter(aputr_tbl, Trend == 'MNR < FDR 5%' ), color = 'firebrick4') +
geom_point( data = filter(aputr_tbl, Trend == 'MNR < FDR 5%' ), shape = 1) +
geom_text_repel(colour = 'black',
nudge_x = 1,
force = 1,
box.padding = 1,
segment.alpha = .5,
data = filter(aputr_tbl, Gene %in% selected),
mapping = aes(label = c('Prn', 'Purb', 'Msi2', 'Ywhaz', 'Cript', 'Nrxn1') )
) +
geom_hline(yintercept = 0, col = 'black') +
theme_minimal() + theme(legend.position = 'top') +
scale_x_sqrt( breaks = scales::pretty_breaks(n = 3) ) +
scale_colour_manual(values = c( 'black', 'black'), na.value = 'lightgrey') +
labs(x = 'Expression [UMIs]', caption = '*red points = FDR < 5%\nGenotype Effect',
y = "3'UTR PAS Usage\n(EMD) APLP1-/- vs. WT") +
guides(colour = FALSE)
gg_ma_genoGene Ontology and Disease Ontology analysis (Genotype effect on 3'UTR usage)
This chunk shows the example code; results will vary with the random seed and the version of the annotation packages
# This time compute the LRT statistic
lrt <- pbmapply(function(x, y, dof){
ko_th <- colSums(x[y$Genotype == 'KO', ,drop = F] > 0) > 0
wt_th <- colSums(x[y$Genotype == 'WT', ,drop = F] > 0) > 0
use_apa <- ko_th | wt_th
x <- x[ ,use_apa, drop = FALSE]
if( sum(use_apa) > 1 & sum(y$Genotype == 'KO') > 0 & sum(y$Genotype == 'WT') > 0 ){
x_ <- rowsum(x, group = paste(y$Genotype, y$Replicate) )
if( all(rowSums(x_) > 0) ){
fit1 <- nnet::multinom(x_ ~ str_split(rownames(x_), '', simplify = TRUE)[ ,1] , trace = FALSE )
fit0 <- nnet::multinom(x_ ~ 1, trace = FALSE )
pval <- anova(fit1, fit0)$'LR stat.'[2] # test for differential 3'UTR usage (genotype effect)
} else {
pval <- NA
}
} else {
pval <- NA
}
return(pval)
}, x = ctab_list, y = anno_list, SIMPLIFY = TRUE )
score <- lrt # use it as a score to rank the genes for the GSEA method
names(score) <- str_split(names(score), '_', simplify = TRUE)[ ,1]
head(score)
# run the GSEA method
set.seed(1001)
cc_res <- gseGO( sort( abs(score), decreasing = T),
keyType = 'ENSEMBL', OrgDb = 'org.Mm.eg.db',
ont = 'CC', nPerm = 1e4,
pvalueCutoff = 1, minGSSize = 75, maxGSSize = 750)
str(cc_res@result)
# tranlsate into human ENTREZ identifier
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
score_h <- getLDS(attributes = c("ensembl_gene_id"), filters = "ensembl_gene_id", values = names(score),
mart = mouse, attributesL = c("hgnc_symbol"), martL = human, uniqueRows=T)
# run the disease ontology analysis (DO) as for the in vivo analysis (data from kalamakis et al. 2019)... Disease Ontology
load(file = 'gsea_aplp1.rda') # pre-compiled results
do_sub <- mutate(do_sub, Description = factor(Description, levels = (Description)[order(NES)] ) )
ggplot( do_sub, aes(y = setSize, x = Description, fill = NES ) ) + geom_col() + coord_flip() +
ggtitle("GSEA (Rank by APLP1-/- vs. WT \n3'UTR Change LR Stat., FDR < 10%)") +
theme_minimal() +
labs(x = '', y = 'Size of Gene Sets') +
scale_fill_gradientn( colors = c('darkorchid1', 'darkorchid2', 'darkorchid3', 'darkorchid4', 'black') ) +
ggtitle('Disease Ontology (DO)')... now Gene Ontology
lsl_sub <- mutate(lsl_sub, Description = factor(Description, levels = (Description)[order(NES)] ) )
ggplot( lsl_sub, aes(y = setSize, x = Description, fill = NES ) ) + geom_col() + coord_flip() +
ggtitle("GSEA (Rank by longer 3'UTR\nAPLP1-/- vs. WT), FDR < 10%") + theme_minimal() +
labs(x = '', y = 'Size of Gene Sets') +
scale_fill_gradient(low = 'dodgerblue4', high = 'black') #+... intersection (of APLP1-/- affected genes) with CPEB4 and CPEB1 binders from Parras et al. 2018
HYPOTHESIS: APLP1 acts through CPEB4, thereby modulating 3'UTR usage
ggplot(rip_aplp1_tbl, aes(y = GeneRatio, x = GeneSet, fill = Binding)) +
theme_minimal() +
scale_fill_manual(values = c('darkorchid4', 'firebrick4', 'red' , 'grey') ) + #'dodgerblue',
scale_y_continuous(labels = scales::percent) +
geom_col(color = 'black') +
ggtitle('p-value < 0.001') +
labs(subtitle = 'RIP (Parras et al. 2018)',
y = "Gene Ratio [%]", x = "p-value" )Identification of CPE-flanked isoforms and their usage along the neural stem cell lineage (APLP1-/- vs. WT)
This chunks show the example code
Example 1: ... again MNR-splines in APLP1-WT
# compute MNR-splines (here for APLP1-/- as example)
mnrg_list_wt <- pbmapply(function(x, y, dof){
# compute the multinomial regression, try vgam
if( length(y$Genotype == 'WT') > 49 & sum(( colSums( x[y$Genotype == 'WT', ,drop = FALSE]) ) > 2) > 1 ){
x <- x[y$Genotype == 'WT', ]
y <- y[y$Genotype == 'WT', ]
# back-transform to get the splines with the nnet package
fit1 <- nnet::multinom(x ~ bs(y$pTime, df = dof), trace = FALSE )
fit0 <- nnet::multinom(x ~ 1, trace = FALSE )
m <- cbind( 1, bs( y$pTime, df = dof ) ) %*% t(coef(fit1))
m <- exp( cbind( iso1 = 0, m ) )
m <- m / rowSums(m)
r <- cor(y$pTime, m)[1, ]
colnames(m) <- seq_len(ncol(m))
m <- m[order(y$pTime), ]
# return the values of interest
out <- list(spline_matrix = m, pTime = y$pTime, spline_corr = r)
} else {
return(NA)
}
return(out)
}, x = ctab_list, y = anno_list, MoreArgs = list(dof = 3), SIMPLIFY = FALSE )Example 2: ... effect for CPE-flanked PASs
ix <- sapply(mnrg_list_wt, class) != 'logical'
cpe_wt <- mapply(function(x, y, z, s){
m <- 'TTTTGT|TTTTGAT|TTTTAGT'
loc <- (str_locate_all(y, m)[[1]][ ,1])
pos <- sapply(z, function(x, loc){ any(x - loc > 5 & x - loc < 55) }, loc)
th <- s > 79
pt <- sort(x$pTime)
corr <- apply(x$spline_matrix, 2, function(x){
cor(x[pt > 2.5], pt[pt > 2.5] ) } ) # This is for the aNSC to NB transition
pce <- corr[which(pos & th)]
ncpe <- corr[which(!pos & th)]
if( any(th) ){
return(list( Positive = pce[1], Negative = ncpe[1] ))
} else {
return(NULL)
}
},
y = apaFiltered@Sequence$Seqeuence[ix], s = supp_list[ix],
x = mnrg_list_wt[ix], z = peak_list[ix], SIMPLIFY = FALSE
) %>% unname()
anyPos <- lapply( cpe_wt, function(x){ is.numeric(x$Positive) &
length(x$Positive) > 0 } ) %>% unlist2()
pos <- lapply( cpe_wt, function(x){ x$Positive } )[ anyPos] %>% unlist2()
neg <- lapply( cpe_wt, function(x){ x$Negative } )[ anyPos] %>% unlist2()
pos <- pos[!is.na(pos)]
neg <- neg[!is.na(neg)]
ix <- sapply(mnrg_list_ko, class) != 'logical'
cpe_ko <- mapply(function(x, y, z, s){
m <- 'TTTTGT|TTTTGAT|TTTTAGT'
loc <- (str_locate_all(y, m)[[1]][ ,1])
pos <- sapply(z, function(x, loc){ any(x - loc > 5 & x - loc < 55) }, loc)
th <- s > 79
pt <- sort(x$pTime)
corr <- apply(x$spline_matrix, 2, function(x){
cor(x[pt > 2.5], pt[pt > 2.5] ) } )
pce <- corr[which(pos & th)]
ncpe <- corr[which(!pos & th)]
if( any(th) ){
return(list( Positive = pce[1], Negative = ncpe[1] ))
} else {
return(NULL)
}
},
y = apaFiltered@Sequence$Seqeuence[ix], s = supp_list[ix],
x = mnrg_list_ko[ix], z = peak_list[ix], SIMPLIFY = FALSE
) %>% unname()
anyPos <- lapply( cpe_ko, function(x){ is.numeric(x$Positive) &
length(x$Positive) > 0 } ) %>% unlist2()
pos_ <- lapply( cpe_ko, function(x){ x$Positive } )[ anyPos] %>% unlist2()
neg_ <- lapply( cpe_ko, function(x){ x$Negative } )[ anyPos] %>% unlist2()
pos_ <- pos[!is.na(pos_)]
neg_ <- neg[!is.na(neg_)]
cpe_cor_tbl_aplp_nb <- tibble( Cor = c(pos, neg, pos_, neg_),
Peak = c( rep('CPE', length(pos)), rep('No CPE', length(neg)),
rep('CPE', length(pos_)), rep('No CPE', length(neg_)) ),
Genotype = c( rep('WT', length(c(pos, neg)) ), rep('APLP1-/-', length(c(pos_, neg_)) ) )
)Usage of CPE-flanked PAS (APLP1-/- vs. WT, compare to the WT from Kalamakis et al. 2019)
load(file = 'cpe_cor_tbl_aplp.rda')
cpe_cor_tbl_aplp$Genotype <- factor(cpe_cor_tbl_aplp$Genotype,
levels = c('WT', 'APLP1-/-'))
fit1 <- glm(Cor ~ Peak + Genotype + Peak:Genotype, data = cpe_cor_tbl_aplp)
fit0 <- glm(Cor ~ Peak + Genotype, data = cpe_cor_tbl_aplp)
anova(fit1, fit0, test = 'Chisq')wt <- filter(cpe_cor_tbl_aplp, Genotype == "WT" & Peak == "CPE")$Cor
ko <- filter(cpe_cor_tbl_aplp, Genotype == "APLP1-/-" & Peak == "CPE")$Cor
wilcox.test(ko, wt)##
## Wilcoxon rank sum test with continuity correction
##
## data: ko and wt
## W = 44835, p-value = 0.0365
## alternative hypothesis: true location shift is not equal to 0
gg_cpe_trend <- ggplot(cpe_cor_tbl_aplp, aes(y = Cor, x = Genotype, color = Peak)) +
geom_violin(mapping = aes(fill = Peak), alpha = .3) +
geom_boxplot(width = .15, mapping = aes(fill = Peak), color = 'black' ) +
theme_classic() +
facet_grid(Peak~.) +
scale_color_manual(values = c('firebrick4', 'black')) +
scale_fill_manual(values = c('red', 'darkgrey')) +
geom_hline(yintercept = 0, alpha = .5, size = 0.25) +
guides(color = FALSE, fill = FALSE) +
labs(x = 'color code: as in Fig.2 e)\n Transition: qNSCs to aNSCs',
y = 'Usage of PAS vs. pseudotime\nincrease (> 0) or decrease(< 0)') +
ggtitle('p-value = 0.04')
gg_cpe_trend... fo the second transition (not evaluated in this script)
cpe_cor_tbl_aplp_nb$Genotype <- factor(cpe_cor_tbl_aplp_nb$Genotype,
levels = c('WT', 'APLP1-/-'))
fit1 <- glm(Cor ~ Peak + Genotype + Peak:Genotype, data = cpe_cor_tbl_aplp_nb)
fit0 <- glm(Cor ~ Peak + Genotype, data = cpe_cor_tbl_aplp_nb)
anova(fit1, fit0, test = 'Chisq')
wt <- filter(cpe_cor_tbl_aplp_nb, Genotype == "WT" & Peak == "CPE")$Cor
ko <- filter(cpe_cor_tbl_aplp_nb, Genotype == "APLP1-/-" & Peak == "CPE")$Cor
wilcox.test(ko, wt)
gg_cpe_trend <- ggplot(cpe_cor_tbl_aplp_nb, aes(y = Cor, x = Genotype, color = Peak)) +
geom_violin(mapping = aes(fill = Peak), alpha = .3) +
geom_boxplot(width = .15, mapping = aes(fill = Peak), color = 'black' ) +
theme_classic() +
facet_grid(Peak~.) +
scale_color_manual(values = c('firebrick4', 'black')) +
scale_fill_manual(values = c('red', 'darkgrey')) +
geom_hline(yintercept = 0, alpha = .5, size = 0.25) +
guides(color = FALSE, fill = FALSE) +
labs(x = 'Transition: aNSCs to NBs',
y = 'PAS Usage along the NSC lineage\nincrease (> 0) or decrease(< 0)') +
ggtitle('p-value = 0.04')
gg_cpe_trend... one can compare the differential 3'UTR usage in APLP1-/- (mouse) vs. the autism cohort (human)
This table contains some matched statistics and annotations of both analyses.
load(file = 'cross_new_master.rda')
cross_new_master